#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
#else

static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
                         THNNState *state,
                         THTensor *input,
                         THTensor *gradOutput,
                         int kT,
                         int kW,
                         int kH,
                         int dT,
                         int dW,
                         int dH,
                         int padT,
                         int padW,
                         int padH,
                         bool ceil_mode)
{
  long nslices;
  long itime;
  long iheight;
  long iwidth;
  long otime;
  long oheight;
  long owidth;
  int ndim = input->nDimension;
  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;

  if (input->nDimension == 5)
  {
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }

  THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
             "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
             kT, kH, kW);
  THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
             "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
             dT, dH, dW);
  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
                "4D or 5D (batch mode) tensor expected for input, but got: %s");

  THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
             && input->size[dimt] >= kT, 2,
             "input image (T: %d H: %d W: %d) smaller than "
             "kernel size (kT: %d kH: %d kW: %d)",
             input->size[dimt], input->size[dimh], input->size[dimw],
             kT, kH, kW);

  // The second argument is argNumber... here is the index of padH.
  THArgCheck(kT/2 >= padT && kW/2 >= padW && kH/2 >= padH, 11,
            "pad should not be greater than half of kernel size, but got "
            "padT = %d, padW = %d, padH = %d, kT = %d, kW = %d, kH = %d",
            padT, padW, padH, kT, kW, kH);

  /* sizes */
  nslices = input->size[dimN];
  itime   = input->size[dimt];
  iheight = input->size[dimh];
  iwidth  = input->size[dimw];

  if (ceil_mode) {
    otime   = (long)(ceil((float)(itime   - kT + 2*padT) / dT)) + 1;
    oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
    owidth  = (long)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
  }
  else
  {
    otime   = (long)(floor((float)(itime   - kT + 2*padT) / dT)) + 1;
    oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
    owidth  = (long)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
  }

  if (padT || padW || padH)
  {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((otime   - 1)*dT >= itime   + padT)
      --otime;
    if ((oheight - 1)*dH >= iheight + padH)
      --oheight;
    if ((owidth  - 1)*dW >= iwidth  + padW)
      --owidth;
  }

  if (otime < 1 || owidth < 1 || oheight < 1)
    THError("Given input size: (%dx%dx%dx%d). "
	    "Calculated output size: (%dx%dx%dx%d). Output size is too small",
            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);

  if (gradOutput != NULL) {
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
    THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
  }
}

static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
          real *input_p,
          real *output_p,
          long nslices,
          long itime,
          long iwidth,
          long iheight,
          long otime,
          long owidth,
          long oheight,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int padT,
          int padW,
          int padH,
          bool count_include_pad)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long i, j, ti;

    /* local pointers. */
    real *ip = input_p + k * itime * iwidth * iheight;
    real *op = output_p + k * otime * owidth * oheight;
    for (i = 0; i < otime * oheight * owidth; ++i)
      *(op + i) = 0;

    /* loop over output */
    for (ti = 0; ti < otime; ti++)
    {
      for (i = 0; i < oheight; i++)
      {
        for (j = 0; j < owidth; j++)
        {
          /* compute pool range. */
          long tstart = ti * dT - padT;
          long hstart = i  * dH - padH;
          long wstart = j  * dW - padW;
          long tend = fminf(tstart + kT, itime + padT);
          long hend = fminf(hstart + kH, iheight + padH);
          long wend = fminf(wstart + kW, iwidth + padW);
          long pool_size = (tend - tstart) * (hend - hstart) * (wend - wstart);
          tstart = fmaxf(tstart, 0);
          hstart = fmaxf(hstart, 0);
          wstart = fmaxf(wstart, 0);
          tend = fmin(tend, itime);
          hend = fmin(hend, iheight);
          wend = fmin(wend, iwidth);

          int divide_factor;
          if (count_include_pad)
            divide_factor = pool_size;
          else
            divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

          /* compute local sum: */
          real sum = 0.0;
          long x, y, z;

          for (z = tstart; z < tend; z++)
          {
            for (y = hstart; y < hend; y++)
            {
              for (x = wstart; x < wend; x++)
              {
                sum +=  *(ip + z * iwidth * iheight + y * iwidth + x);
              }
            }
          }

          /* set output to local max */
          *op++ += sum / divide_factor;
        }
      }
    }
  }
}

void THNN_(VolumetricAveragePooling_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int padT,
          int padW,
          int padH,
          bool ceil_mode,
          bool count_include_pad)
{
  long nslices;
  long itime;
  long iheight;
  long iwidth;
  long otime;
  long oheight;
  long owidth;
  real *input_data;
  real *output_data;

  THNN_(VolumetricAveragePooling_shapeCheck)(
        state, input, NULL, kT, kW, kH,
        dT, dW, dH, padT, padW, padH, ceil_mode);

  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;

  if (input->nDimension == 5)
  {
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }

  /* sizes */
  nslices = input->size[dimN];
  itime   = input->size[dimt];
  iheight = input->size[dimh];
  iwidth  = input->size[dimw];
  if (ceil_mode)
  {
    otime   = (long)(ceil((float)(itime   - kT + 2*padT) / dT)) + 1;
    oheight = (long)(ceil((float)(iheight - kH + 2*padH) / dH)) + 1;
    owidth  = (long)(ceil((float)(iwidth  - kW + 2*padW) / dW)) + 1;
  }
  else
  {
    otime   = (long)(floor((float)(itime   - kT + 2*padT) / dT)) + 1;
    oheight = (long)(floor((float)(iheight - kH + 2*padH) / dH)) + 1;
    owidth  = (long)(floor((float)(iwidth  - kW + 2*padW) / dW)) + 1;
  }
  if (padT || padH || padW)
  {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((otime   - 1)*dT >= itime   + padT)
      --otime;
    if ((oheight - 1)*dH >= iheight + padH)
      --oheight;
    if ((owidth  - 1)*dW >= iwidth  + padW)
      --owidth;
  }

  /* get contiguous input */
  input = THTensor_(newContiguous)(input);

  if (input->nDimension == 4) /* non-batch mode */
  {
    /* resize output */
    THTensor_(resize4d)(output, nslices, otime, oheight, owidth);

    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);

    THNN_(VolumetricAveragePooling_updateOutput_frame)(
      input_data, output_data, nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      kT, kW, kH,
      dT, dW, dH,
      padT, padW, padH,
      count_include_pad
    );
  }
  else  /* batch mode */
  {
    long p;
    long nBatch = input->size[0];

    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;

    /* resize output */
    THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);

    input_data = THTensor_(data)(input);
    output_data = THTensor_(data)(output);

#pragma omp parallel for private(p)
    for (p=0; p < nBatch; p++)
    {
      THNN_(VolumetricAveragePooling_updateOutput_frame)(
        input_data + p * istride, output_data + p * ostride, nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        kT, kW, kH,
        dT, dW, dH,
        padT, padW, padH,
        count_include_pad
      );
    }
  }

  /* cleanup */
  THTensor_(free)(input);
}

static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
          real *gradInput_p,
          real *gradOutput_p,
          long nslices,
          long itime,
          long iwidth,
          long iheight,
          long otime,
          long owidth,
          long oheight,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int padT,
          int padW,
          int padH,
          bool count_include_pad)
{
  long k;
#pragma omp parallel for private(k)
  for (k = 0; k < nslices; k++)
  {
    long i, j, ti;

    /* local pointers */
    real *ip = gradInput_p + k * itime * iwidth * iheight;
    real *op = gradOutput_p + k * otime * owidth * oheight;
    for (i = 0; i < itime*iwidth*iheight; i++)
      *(ip + i) = 0;

    /* loop over output */
    for (ti = 0; ti < otime; ti++)
    {
      for (i = 0; i < oheight; i++)
      {
        for (j = 0; j < owidth; j++)
        {
          long tstart = ti * dT - padT;
          long hstart = i  * dH - padH;
          long wstart = j  * dW - padW;
          long tend = fminf(tstart + kT, itime + padT);
          long hend = fminf(hstart + kH, iheight + padH);
          long wend = fminf(wstart + kW, iwidth + padW);
          long pool_size = (tend -tstart) * (hend - hstart) * (wend - wstart);
          tstart = fmaxf(tstart, 0);
          hstart = fmaxf(hstart, 0);
          wstart = fmaxf(wstart, 0);
          tend = fminf(tend, itime);
          hend = fminf(hend, iheight);
          wend = fminf(wend, iwidth);

          long divide_factor;
          if (count_include_pad)
            divide_factor = pool_size;
          else
            divide_factor = (tend - tstart) * (hend - hstart) * (wend - wstart);

          /* scatter gradients out to footprint: */
          real val  = *op++;

          long x,y,z;
          for (z = tstart; z < tend; z++)
          {
            for (y = hstart; y < hend; y++)
            {
              for (x = wstart; x < wend; x++)
              {
                *(ip + z * iheight * iwidth + y * iwidth + x) += val / divide_factor;
              }
            }
          }
        }
      }
    }
  }
}

void THNN_(VolumetricAveragePooling_updateGradInput)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradInput,
          int kT,
          int kW,
          int kH,
          int dT,
          int dW,
          int dH,
          int padT,
          int padW,
          int padH,
          bool ceil_mode,
          bool count_include_pad)
{
  long nslices;
  long itime;
  long iheight;
  long iwidth;
  long otime;
  long oheight;
  long owidth;
  real *gradInput_data;
  real *gradOutput_data;

  int dimN = 0;
  int dimt = 1;
  int dimh = 2;
  int dimw = 3;

  THNN_(VolumetricAveragePooling_shapeCheck)(
        state, input, gradOutput, kT, kW, kH,
        dT, dW, dH, padT, padW, padH, ceil_mode);

  /* get contiguous gradOutput */
  gradOutput = THTensor_(newContiguous)(gradOutput);

  /* resize */
  THTensor_(resizeAs)(gradInput, input);
  THTensor_(zero)(gradInput);

  if (input->nDimension == 5)
  {
    dimN++;
    dimt++;
    dimh++;
    dimw++;
  }

  /* sizes */
  nslices = input->size[dimN];
  itime = input->size[dimt];
  iheight = input->size[dimh];
  iwidth = input->size[dimw];
  otime = gradOutput->size[dimt];
  oheight = gradOutput->size[dimh];
  owidth = gradOutput->size[dimw];

  /* get raw pointers */
  gradInput_data = THTensor_(data)(gradInput);
  gradOutput_data = THTensor_(data)(gradOutput);

  /* backprop */
  if (input->nDimension == 4) /* non-batch mode*/
  {
    THNN_(VolumetricAveragePooling_updateGradInput_frame)(
      gradInput_data, gradOutput_data, nslices,
      itime, iwidth, iheight,
      otime, owidth, oheight,
      kT, kW, kH,
      dT, dW, dH,
      padT, padW, padH,
      count_include_pad
    );
  }
  else /* batch mode */
  {
    long p;
    long nBatch = input->size[0];

    long istride = nslices * itime * iwidth * iheight;
    long ostride = nslices * otime * owidth * oheight;

#pragma omp parallel for private(p)
    for (p = 0; p < nBatch; p++)
    {
      THNN_(VolumetricAveragePooling_updateGradInput_frame)(
        gradInput_data  + p * istride, gradOutput_data + p * ostride, nslices,
        itime, iwidth, iheight,
        otime, owidth, oheight,
        kT, kW, kH,
        dT, dW, dH,
        padT, padW, padH,
        count_include_pad
      );
    }
  }

  /* cleanup */
  THTensor_(free)(gradOutput);
}

#endif
